Library loading

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.6
## ✔ ggplot2   4.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'janitor'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## 
## Loading required package: lattice
## 
## 
## Attaching package: 'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## 
## Attaching package: 'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loaded glmnet 4.1-10
## 
## Loading required package: mvtnorm
## 
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## Type 'citation("pROC")' for a citation.
## 
## 
## Attaching package: 'pROC'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## 
## randomForest 4.7-1.2
## 
## Type rfNews() to see new features/changes/bug fixes.
## 
## 
## Attaching package: 'randomForest'
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Data loading

# Load the raw marketing-campaign data. NOTE: despite the .csv extension the
# file is tab-delimited (hence read_tsv; the rendered column spec confirms
# Delimiter: "\t"). clean_names() converts headers to snake_case
# (e.g. "Dt_Customer" -> "dt_customer").
df <- readr::read_tsv("data/marketing_campaign.csv") %>%
  janitor::clean_names()
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Content

- AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise
- AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
- AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
- AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise
- AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise
- Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise
- Complain - 1 if customer complained in the last 2 years
- DtCustomer - date of customer’s enrolment with the company
- Education - customer’s level of education
- Marital - customer’s marital status
- Kidhome - number of small children in customer’s household
- Teenhome - number of teenagers in customer’s household
- Income - customer’s yearly household income
- MntFishProducts - amount spent on fish products in the last 2 years
- MntMeatProducts - amount spent on meat products in the last 2 years
- MntFruits - amount spent on fruit products in the last 2 years
- MntSweetProducts - amount spent on sweet products in the last 2 years
- MntWines - amount spent on wine products in the last 2 years
- MntGoldProds - amount spent on gold products in the last 2 years
- NumDealsPurchases - number of purchases made with discount
- NumCatalogPurchases - number of purchases made using catalogue
- NumStorePurchases - number of purchases made directly in stores
- NumWebPurchases - number of purchases made through company’s web site
- NumWebVisitsMonth - number of visits to company’s web site in the last month
- Recency - number of days since the last purchase

Exploratory Data Analysis

# Structural overview: one line per column with its type and sample values.
glimpse(df)
## Rows: 2,240
## Columns: 29
## $ id                    <dbl> 5524, 2174, 4141, 6182, 5324, 7446, 965, 6177, 4…
## $ year_birth            <dbl> 1957, 1954, 1965, 1984, 1981, 1967, 1971, 1985, …
## $ education             <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status        <chr> "Single", "Single", "Together", "Together", "Mar…
## $ income                <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635,…
## $ kidhome               <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ teenhome              <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ dt_customer           <chr> "04-09-2012", "08-03-2014", "21-08-2013", "10-02…
## $ recency               <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 11, 59, …
## $ mnt_wines             <dbl> 635, 11, 426, 11, 173, 520, 235, 76, 14, 28, 5, …
## $ mnt_fruits            <dbl> 88, 1, 49, 4, 43, 42, 65, 10, 0, 0, 5, 16, 61, 2…
## $ mnt_meat_products     <dbl> 546, 6, 127, 20, 118, 98, 164, 56, 24, 6, 6, 11,…
## $ mnt_fish_products     <dbl> 172, 2, 111, 10, 46, 0, 50, 3, 3, 1, 0, 11, 225,…
## $ mnt_sweet_products    <dbl> 88, 1, 21, 3, 27, 42, 49, 1, 3, 1, 2, 1, 112, 5,…
## $ mnt_gold_prods        <dbl> 88, 6, 42, 5, 15, 14, 27, 23, 2, 13, 1, 16, 30, …
## $ num_deals_purchases   <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ num_web_purchases     <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 1, 1, 2, 3, 6, 1, 7, …
## $ num_catalog_purchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 0, 0, 4, 1, 0, 6,…
## $ num_store_purchases   <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 0, 2, 3, 8, 5, 3, 1…
## $ num_web_visits_month  <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 7, 8, 2, 6, 8, 3,…
## $ accepted_cmp3         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp4         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp5         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp1         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ complain              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ z_cost_contact        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ z_revenue             <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ response              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
# Confirm dataset dimensions: 2240 customers x 29 variables.
dim(df)
## [1] 2240   29

Only the “income” variable has missing values. Since there are only 24 of them, the affected rows were simply removed.

# Per-column summary: missingness, quantiles and mini-histograms.
skimr::skim(df)
Data summary
Name df
Number of rows 2240
Number of columns 29
_______________________
Column type frequency:
character 3
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
education 0 1 3 10 0 5 0
marital_status 0 1 4 8 0 8 0
dt_customer 0 1 10 10 0 663 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 5592.16 3246.66 0 2828.25 5458.5 8427.75 11191 ▇▇▇▇▇
year_birth 0 1.00 1968.81 11.98 1893 1959.00 1970.0 1977.00 1996 ▁▁▂▇▅
income 24 0.99 52247.25 25173.08 1730 35303.00 51381.5 68522.00 666666 ▇▁▁▁▁
kidhome 0 1.00 0.44 0.54 0 0.00 0.0 1.00 2 ▇▁▆▁▁
teenhome 0 1.00 0.51 0.54 0 0.00 0.0 1.00 2 ▇▁▇▁▁
recency 0 1.00 49.11 28.96 0 24.00 49.0 74.00 99 ▇▇▇▇▇
mnt_wines 0 1.00 303.94 336.60 0 23.75 173.5 504.25 1493 ▇▂▂▁▁
mnt_fruits 0 1.00 26.30 39.77 0 1.00 8.0 33.00 199 ▇▁▁▁▁
mnt_meat_products 0 1.00 166.95 225.72 0 16.00 67.0 232.00 1725 ▇▁▁▁▁
mnt_fish_products 0 1.00 37.53 54.63 0 3.00 12.0 50.00 259 ▇▁▁▁▁
mnt_sweet_products 0 1.00 27.06 41.28 0 1.00 8.0 33.00 263 ▇▁▁▁▁
mnt_gold_prods 0 1.00 44.02 52.17 0 9.00 24.0 56.00 362 ▇▁▁▁▁
num_deals_purchases 0 1.00 2.33 1.93 0 1.00 2.0 3.00 15 ▇▂▁▁▁
num_web_purchases 0 1.00 4.08 2.78 0 2.00 4.0 6.00 27 ▇▃▁▁▁
num_catalog_purchases 0 1.00 2.66 2.92 0 0.00 2.0 4.00 28 ▇▂▁▁▁
num_store_purchases 0 1.00 5.79 3.25 0 3.00 5.0 8.00 13 ▂▇▂▃▂
num_web_visits_month 0 1.00 5.32 2.43 0 3.00 6.0 7.00 20 ▅▇▁▁▁
accepted_cmp3 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp4 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp5 0 1.00 0.07 0.26 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp1 0 1.00 0.06 0.25 0 0.00 0.0 0.00 1 ▇▁▁▁▁
accepted_cmp2 0 1.00 0.01 0.11 0 0.00 0.0 0.00 1 ▇▁▁▁▁
complain 0 1.00 0.01 0.10 0 0.00 0.0 0.00 1 ▇▁▁▁▁
z_cost_contact 0 1.00 3.00 0.00 3 3.00 3.0 3.00 3 ▁▁▇▁▁
z_revenue 0 1.00 11.00 0.00 11 11.00 11.0 11.00 11 ▁▁▇▁▁
response 0 1.00 0.15 0.36 0 0.00 0.0 0.00 1 ▇▁▁▁▂
# Convert identifiers and categorical indicators to factors.
# NOTE(review): this version of df_clean is discarded a few steps below,
# where df_clean is rebuilt directly from `df` (the z_cost_contact /
# z_revenue drop). The factor conversion and the drop_na(income) here
# therefore never reach the modelling pipeline — confirm this is intended.
df_clean <- df %>%
  mutate(
    across(c(
      id, education, marital_status,
      kidhome, teenhome,
      accepted_cmp1, accepted_cmp2, accepted_cmp3,
      accepted_cmp4, accepted_cmp5, response
    ), as.factor)
  )
# Remove the 24 rows with missing income.
df_clean <- df_clean %>% tidyr::drop_na(income)

# Sanity check: no missing values remain after the drop.
sum(is.na(df_clean))
## [1] 0

The variables “z_cost_contact” and “z_revenue” have a constant value, as they do not add any information and are therefore eliminated from the analysis.

# Identify constant columns (exactly one distinct value across all rows).
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_unique") %>%
  filter(n_unique == 1)
## # A tibble: 2 × 2
##   var            n_unique
##   <chr>             <int>
## 1 z_cost_contact        1
## 2 z_revenue             1
# Drop the two constant columns.
# NOTE(review): this starts from `df`, not from the df_clean built earlier —
# the factor conversion and drop_na(income) above are overwritten here.
# The later pipeline (rowSums over accepted_cmp*) relies on these columns
# staying numeric, so changing `df` to `df_clean` would break it; the two
# cleaning passes should be reconciled deliberately.
df_clean <- df %>% dplyr::select(-z_cost_contact, -z_revenue)

Two variables with almost zero variance were found and therefore eliminated.

# Near-zero-variance screen (caret): flags columns whose dominant value is
# overwhelmingly frequent. accepted_cmp2 and complain are flagged and dropped.
nzv <- nearZeroVar(df_clean, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]
##               freqRatio percentUnique zeroVar  nzv
## accepted_cmp2  73.66667    0.08928571   FALSE TRUE
## complain      105.66667    0.08928571   FALSE TRUE
# Keep only columns passing the filter (nzv rows align with df_clean columns).
df_clean <- df_clean[, !nzv$zeroVar & !nzv$nzv]

The “YOLO” and “Absurd” categories of the “marital_status” variable have been removed, as they have a very low presence and distort the analysis; the rare “Alone” category has been recoded as “Single.”

# Tidy up marital_status: drop the rare "YOLO"/"Absurd" levels, fold the
# rare "Alone" level into "Single", then drop the now-unused levels.
df_clean <- df_clean %>%
  filter(!marital_status %in% c("YOLO", "Absurd")) %>%
  mutate(marital_status = fct_recode(marital_status,
    "Single" = "Alone" 
  )) %>%
  mutate(marital_status = droplevels(marital_status))

# Merge the two small "formerly partnered" groups into a single level.
df_clean <- df_clean %>%
  mutate(
    marital_status = fct_collapse(marital_status,
      "Divorced_Widow" = c("Divorced", "Widow")
    )
  )

count(df_clean, marital_status)
## # A tibble: 4 × 2
##   marital_status     n
##   <fct>          <int>
## 1 Single           483
## 2 Divorced_Widow   309
## 3 Married          864
## 4 Together         580

The variables relating to children and young people in households have been grouped together under a single variable, “Children.”

# Feature engineering: total number of children (kids + teens) per household.
df_clean <- df_clean %>%
  mutate(children = kidhome + teenhome) %>%
  dplyr::select(-kidhome, -teenhome)
# Total expenditure across all six product categories.
df_clean <- df_clean %>%
  mutate(
    total_spent =
      mnt_wines + mnt_fruits + mnt_meat_products +
        mnt_fish_products + mnt_sweet_products +
        mnt_gold_prods
  )
# Customer age relative to a hard-coded reference year.
# NOTE(review): 2024 is a magic constant; consider a named reference_year
# variable so the notebook does not silently age on re-runs.
df_clean <- df_clean %>%
  mutate(age = 2024 - year_birth) %>%
  dplyr::select(-year_birth)

A logarithmic transformation was applied to the Income feature to correct for the strong positive asymmetry (right-skewness) typical of income distributions.

# Log-transform income to tame its heavy right tail (skim shows a maximum of
# 666666 vs a median of ~51k). Rows with missing income produce NA here and
# are removed in the following step. All observed incomes are positive
# (skim p0 = 1730), so log() is safe.
df_clean <- df_clean %>%
  mutate(income_log = log(income)) %>%
  dplyr::select(-income)


# Drop the rows whose income (hence income_log) is missing.
# FIX: the original wrote `na.omit(is.na(income_log))`; na.omit() silently
# ignores the extra argument, so it actually dropped rows with NA in ANY
# column. drop_na(income_log) states the intent explicitly; since income is
# the only column with missing values (per the skim summary), the result is
# identical.
df_clean <- df_clean %>%
  tidyr::drop_na(income_log)

# Flag customers who accepted at least one of the past campaigns.
# FIX: starts_with("Accepted") only matched because tidyselect helpers are
# case-insensitive by default; use the actual snake_case prefix so the
# selection no longer depends on that default. Note `.` refers to the
# pre-mutate data, so the new accepted_any column is not swept up here.
df_clean <- df_clean %>%
  mutate(
    accepted_any = if_else(rowSums(dplyr::select(., starts_with("accepted_cmp"))) > 0, 1, 0),
    accepted_any = as.factor(accepted_any)
  )

# The individual campaign indicators are now redundant with accepted_any
# (accepted_cmp2 was already removed by the near-zero-variance filter).
df_clean <- df_clean %>%
  dplyr::select(-accepted_cmp1, -accepted_cmp3, -accepted_cmp4, -accepted_cmp5)

# Class balance of the engineered flag: roughly 21% of customers accepted
# at least one past campaign.
table(df_clean$accepted_any)
## 
##    0    1 
## 1755  457
# 2212 rows remain after dropping NA-income rows and the rare marital levels.
dim(df_clean)
## [1] 2212   22
# Distribution of the target variable: responders are a clear minority.
ggplot(df_clean, aes(x = factor(response))) +
  geom_bar(width = 0.6, fill = "#4E79A7", color = "white", alpha = 0.9) +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  ) +
  labs(
    title = "Distribution Response",
    x = "Response",
    y = "Count"
  )

# Log-income distribution before outlier removal.
df_clean %>%
  ggplot(aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Income",
    x = "Log Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Remove one extreme-income customer and re-plot the log-income distribution.
# NOTE(review): the link between id 9432 and the 666666 income maximum seen
# in the skim summary is implied by the narrative, not shown here — confirm.
df_clean <- df_clean %>% filter(id != 9432)
df_clean %>%
  ggplot(aes(x = income_log)) +
  geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Log Income",
    x = "Log Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Bar chart of education levels ordered by decreasing frequency
# (fct_infreq gives the same ordering as the original
# reorder(x, x, function(x) -length(x)) idiom), with a count label per bar.
df_clean %>%
  ggplot(aes(x = fct_infreq(education))) +
  geom_bar(width = 0.6, fill = "#4682B4", color = "white", alpha = 0.8) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, color = "#333333") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = "Level of Education",
    x = "Educational Qualification",
    y = "Count"
  )

total_spent is a variable created to show the total expenditure incurred by the various instances. Its marked asymmetry is due to the underlying asymmetry of the various variables that comprise it.

# Right-skewed distribution of total spend across all product categories.
df_clean %>%
  ggplot(aes(x = total_spent)) +
  geom_histogram(bins = 40, fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Total Spent",
    x = "Total Spent",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Household composition: number of children (kids + teens) per customer.
df_clean %>%
  ggplot(aes(x = factor(children))) +
  geom_bar(fill = "#4682B4", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution Children",
    x = "Children",
    y = "Counts"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

As can be seen from the graphs, all the “mnt_” variables show strong asymmetry.

# Collect the names of the spend ("mnt_*") columns.
colonne_mnt <- df_clean %>%
  dplyr::select(starts_with("mnt")) %>%
  names()

# Build one histogram per spend column (euro-formatted x axis), then print
# them all; walk() is used for the side effect only.
lista_grafici <- map(colonne_mnt, function(col_name) {
  ggplot(df_clean, aes(x = .data[[col_name]])) + 
    geom_histogram(bins = 30, fill = "#4682B4", color = "white", alpha = 0.8) +
    scale_x_continuous(
      labels = label_dollar(prefix = "€ ", big.mark = ".", decimal.mark = ",")
    ) +
    labs(
      title = paste("Distribution", col_name), 
      x = "Amount spent",
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      panel.grid.minor = element_blank()
    )
})

names(lista_grafici) <- colonne_mnt
walk(lista_grafici, print)

Recency is crucial for calculating the churn rate, indicating the number of days since the last purchase.

# Recency distribution (days since last purchase); the skim mini-histogram
# suggests it is roughly uniform over 0-99 days.
df_clean %>%
  ggplot(aes(x = recency)) +
  geom_histogram(binwidth = 5, fill = "#4682B4", color = "white", alpha = 0.8) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  labs(
    title = "Distribution Recency",
    subtitle = "Days since the customer's last purchase",
    x = "Days since last purchase",
    y = "Number of Customers"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10))
  )

There is a strong correlation (0.64) between income and spending on wine (mnt_wines). High income is positively linked to purchases in stores (0.63) and from catalogs (0.59). There is a strong negative correlation (-0.61) between income and visits to the website. The correlation (0.74) is between mnt_meat_products and num_catalog_purchases (those who buy a lot of meat tend to do so via catalog). Those who spend on fish also tend to spend on fruit (0.59), meat (0.57), and desserts (0.59).

# Correlation analysis on numeric columns only. total_spent is excluded
# because it is a linear combination of the mnt_* columns; the id column
# carries no signal.
df_corr <- df_clean %>%
  dplyr::select(where(is.numeric)) %>%
  dplyr::select(-total_spent) %>%
  dplyr::select(-matches("ID|id"))

# Pairwise correlations; use = "complete.obs" guards against any residual NAs.
corr_matrix <- cor(df_corr, use = "complete.obs")

# FIX: the original colour scale was c("#B2182B", "white", "#E41A1C") —
# dark red -> white -> bright red — which made positive and negative
# correlations nearly indistinguishable by hue. Use a conventional diverging
# palette instead: blue for negative, white for zero, red for positive.
ggcorrplot(corr_matrix,
  method = "square",
  type = "lower",
  lab = FALSE,
  colors = c("#2166AC", "white", "#B2182B"),
  title = "Correlation Matrix",
  ggtheme = theme_minimal()
) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    panel.grid = element_blank()
  )
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Rank variable pairs by absolute correlation.
classifica_correlazioni <- as.data.frame(corr_matrix) %>%
  
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Correlazione") %>%
  # Var1 < Var2 (string comparison) keeps each unordered pair exactly once
  # and drops the diagonal.
  filter(Var1 < Var2) %>%
  arrange(desc(abs(Correlazione)))

print("Higher correlations")
## [1] "Higher correlations"
head(classifica_correlazioni, 10)
## # A tibble: 10 × 3
##    Var1              Var2                  Correlazione
##    <chr>             <chr>                        <dbl>
##  1 mnt_meat_products num_catalog_purchases        0.735
##  2 mnt_wines         num_store_purchases          0.640
##  3 income_log        mnt_wines                    0.638
##  4 mnt_wines         num_catalog_purchases        0.635
##  5 income_log        num_store_purchases          0.625
##  6 income_log        num_web_visits_month        -0.607
##  7 income_log        num_catalog_purchases        0.593
##  8 mnt_fish_products mnt_fruits                   0.592
##  9 mnt_fish_products mnt_sweet_products           0.586
## 10 mnt_fish_products mnt_meat_products            0.574
# Drop the enrolment date (raw string, not used by the models).
df_clean <- df_clean%>%
  dplyr::select(-dt_customer)
# Merge the two lowest education tiers ("Basic", "Graduation") into one level.
df_clean <- df_clean %>%
  mutate(
    education = fct_collapse(education,
      "Undergrad_Grad" = c("Basic", "Graduation")
    ))
table(df_clean$education)
## 
##       2n Cycle Undergrad_Grad         Master            PhD 
##            200           1168            364            479

Target

Customers who tend to spend more are more likely to accept the latest campaign offer, regardless of the product category.

# Spend by response, per product category, on a log scale.
# The warnings below are expected: customers with zero spend in a category
# map to -Inf under log10 and are dropped from the boxplots.
df_clean %>%
  dplyr::select(response, starts_with("mnt")) %>%
  pivot_longer(cols = -response, names_to = "Product_Category", values_to = "Import") %>%
  mutate(response = as.factor(response)) %>%
  ggplot(aes(x = response, y = Import, fill = response)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  scale_y_log10(labels = label_dollar(prefix = "€", big.mark = ".", decimal.mark = ",")) +
  facet_wrap(~Product_Category, scales = "free_y") +
  scale_fill_manual(
    values = c("0" = "#9575CD", "1" = "#F06292"),
    labels = c("0" = "No", "1" = "Yes")
  ) +
  labs(
    title = "Impact of Expenditure on Response",
    x = "Did you accept the last offer?",
    y = "Amount Spent (Log Scale)"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
## Warning in scale_y_log10(labels = label_dollar(prefix = "€", big.mark = ".", :
## log-10 transformation introduced infinite values.
## Warning: Removed 1261 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Those with more children tend not to respond, perhaps due to income or time constraints. Customers with a PhD are the best responders, while those with a basic education tend not to respond. The marital status variable suggests that married or cohabiting people are less likely to respond.

# Plot the share of responders within each level of a categorical variable.
#
# df       - data frame containing `response` and the grouping column.
# var_name - name of the grouping column, as a string (accessed via .data).
# titolo   - plot title.
#
# The dashed white line marks the overall response rate, so each bar can be
# read as above or below average.
plot_pct <- function(df, var_name, titolo) {
  # Overall response rate; as.character first so a factor's codes are not
  # used directly by as.numeric.
  global_mean <- mean(as.numeric(as.character(df$response)), na.rm = TRUE)

  df %>%
    filter(!is.na(.data[[var_name]])) %>%
    ggplot(aes(x = .data[[var_name]], fill = as.factor(response))) +
    # position = "fill" stacks to 100%, giving within-level proportions.
    geom_bar(position = "fill", alpha = 0.9) +
    geom_hline(
      yintercept = global_mean,
      color = "white", linetype = "dashed", linewidth = 1
    ) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(
      values = c("0" = "#9575CD", "1" = "#F06292"),
      name = "Response",
      labels = c("No", "Yes")
    ) +
    labs(
      title = titolo,
      y = "Percentage of Response",
      x = ""
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "top"
    )
}


# Response rate broken down by education, marital status and children.
p1 <- plot_pct(df_clean, "education", "Response for Education Level")
p2 <- plot_pct(df_clean, "marital_status", "Response for Marital Status")
p3 <- plot_pct(df_clean, "children", "Response for Children")

print(p1)

print(p2)

print(p3)

Customers who respond to the latest offer have a median number of days since their last purchase that is significantly lower than those who do not respond to the latest campaign.

# Responders show a clearly lower median recency than non-responders.
df_clean %>%
  ggplot(aes(x = as.factor(response), y = recency, fill = as.factor(response))) +
  geom_boxplot(alpha = 0.8) +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  scale_fill_manual(values = c("0" = "#9575CD", "1" = "#F06292")) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "Recency vs Response",
    x = "Response",
    y = "Recency"
  )

As expected, total spending is higher among consumers who are more likely to accept advertising campaigns.

# Total spend by response.
# NOTE(review): the fill palette here ("violet"/"blue") differs from the
# "#9575CD"/"#F06292" pair used in every other response plot — consider
# unifying for visual consistency.
df_clean %>%
  ggplot(aes(x = factor(response), y = total_spent, fill = factor(response))) +
  geom_boxplot(alpha = 0.8, outlier.colour = "#FF5555", outlier.size = 2) +
  scale_fill_manual(values = c("violet", "blue")) +
  labs(
    title = "Total spent for Response",
    x = "Response",
    y = "Total Spent"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16, color = "#333333"),
    axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
    axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )

Logistic Regression

# Model table: make sure the target is a factor so caret stratifies on it.
df_logistic <- df_clean %>%
  mutate(response = as.factor(response))

# Reproducible 70/30 stratified split on the response.
set.seed(1926)

idx <- caret::createDataPartition(
  df_logistic$response,
  p = 0.7,
  list = FALSE
)

# The id column is an identifier, not a predictor: drop it from both sets.
train <- df_logistic[idx, ] %>% dplyr::select(-id)
test <- df_logistic[-idx, ] %>% dplyr::select(-id)

Weights for imbalanced class

# Inverse-frequency observation weights for the imbalanced target:
# responders (the minority, response == 1) get weight n0/n and
# non-responders get n1/n, so each class contributes roughly equally to the
# weighted likelihood. (response is a factor; `== 1` compares against the
# level label "1".)
class_weights <- ifelse(
  train$response == 1,
  sum(train$response == 0) / nrow(train),
  sum(train$response == 1) / nrow(train)
)

Stepwise selection on the training set. Note: the warning related to non-integer successes is expected due to the use of continuous class weights and does not affect model validity.

# Full model: all candidate predictors spelled out explicitly.
# suppressWarnings(): the weighted binomial glm warns about "non-integer
# #successes"; this is expected with continuous class weights (see the note
# above) and does not affect the fit.
full_model <- suppressWarnings(glm(
  response ~ age + education + marital_status + income_log +
    children + recency + mnt_wines + mnt_fruits +
    mnt_meat_products + mnt_fish_products + mnt_sweet_products +
    mnt_gold_prods + num_deals_purchases + num_web_purchases +
    num_catalog_purchases + num_store_purchases +
    num_web_visits_month + accepted_any,
  data = train,
  family = binomial,
  weights = class_weights
))
# Intercept-only model: the lower bound of the stepwise search.
null_model <- suppressWarnings(glm(
  response ~ 1,
  data = train,
  family = binomial,
  weights = class_weights
))
# Bidirectional stepwise selection by AIC between null and full model;
# trace = TRUE prints every step (shown below).
best_model <- suppressWarnings(stepAIC(
  null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "both",
  trace = TRUE
))
## Start:  AIC=323.62
## response ~ 1
## 
##                         Df Deviance    AIC
## + accepted_any           1   467.19 245.98
## + num_catalog_purchases  1   499.97 278.75
## + mnt_meat_products      1   505.82 284.60
## + mnt_wines              1   505.92 284.70
## + recency                1   516.11 294.89
## + num_web_purchases      1   523.28 302.06
## + children               1   523.99 302.77
## + income_log             1   528.90 307.68
## + marital_status         3   528.12 310.90
## + mnt_fruits             1   533.39 312.17
## + mnt_sweet_products     1   533.60 312.38
## + mnt_fish_products      1   535.53 314.31
## + mnt_gold_prods         1   535.97 314.75
## + num_store_purchases    1   544.40 323.18
## <none>                       546.84 323.62
## + education              3   541.32 324.10
## + num_web_visits_month   1   546.68 325.46
## + num_deals_purchases    1   546.78 325.56
## + age                    1   546.84 325.62
## 
## Step:  AIC=283.39
## response ~ accepted_any
## 
##                         Df Deviance    AIC
## + recency                1   437.19 255.38
## + mnt_meat_products      1   447.69 265.89
## + num_catalog_purchases  1   450.32 268.52
## + marital_status         3   448.14 270.33
## + num_web_purchases      1   458.94 277.13
## + mnt_wines              1   459.60 277.80
## + children               1   460.13 278.33
## + mnt_fruits             1   460.98 279.18
## + mnt_sweet_products     1   461.97 280.17
## + mnt_fish_products      1   463.57 281.76
## + num_deals_purchases    1   463.74 281.94
## + education              3   460.22 282.42
## + income_log             1   464.81 283.01
## + mnt_gold_prods         1   465.10 283.29
## <none>                       467.19 283.39
## + num_web_visits_month   1   465.57 283.77
## + age                    1   467.06 285.26
## + num_store_purchases    1   467.17 285.37
## - accepted_any           1   546.84 361.04
## 
## Step:  AIC=266.45
## response ~ accepted_any + recency
## 
##                         Df Deviance    AIC
## + num_catalog_purchases  1   417.32 248.58
## + mnt_meat_products      1   417.48 248.74
## + marital_status         3   417.48 252.74
## + mnt_wines              1   426.88 258.14
## + num_web_purchases      1   427.23 258.49
## + children               1   430.48 261.74
## + mnt_fruits             1   430.60 261.86
## + mnt_sweet_products     1   430.71 261.97
## + education              3   429.37 264.63
## + mnt_fish_products      1   433.43 264.69
## + num_deals_purchases    1   433.96 265.22
## + income_log             1   434.31 265.57
## + mnt_gold_prods         1   434.99 266.25
## <none>                       437.19 266.45
## + num_web_visits_month   1   435.72 266.98
## + num_store_purchases    1   437.12 268.38
## + age                    1   437.15 268.41
## - recency                1   467.19 294.45
## - accepted_any           1   516.11 343.37
## 
## Step:  AIC=255.3
## response ~ accepted_any + recency + num_catalog_purchases
## 
##                         Df Deviance    AIC
## + num_web_visits_month   1   398.50 238.47
## + marital_status         3   398.89 242.86
## + num_deals_purchases    1   411.78 251.76
## + num_store_purchases    1   412.30 252.28
## + mnt_meat_products      1   413.31 253.28
## + education              3   409.98 253.96
## + num_web_purchases      1   414.70 254.68
## + income_log             1   415.23 255.21
## <none>                       417.32 255.30
## + age                    1   416.43 256.40
## + mnt_wines              1   416.87 256.84
## + children               1   416.99 256.96
## + mnt_fish_products      1   417.00 256.98
## + mnt_fruits             1   417.17 257.14
## + mnt_sweet_products     1   417.27 257.25
## + mnt_gold_prods         1   417.28 257.25
## - num_catalog_purchases  1   437.19 273.16
## - recency                1   450.32 286.30
## - accepted_any           1   463.94 299.92
## 
## Step:  AIC=240.57
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month
## 
##                         Df Deviance    AIC
## + marital_status         3   379.27 227.34
## + mnt_meat_products      1   383.27 227.35
## + children               1   394.30 238.37
## + education              3   392.13 240.20
## <none>                       398.50 240.57
## + mnt_fruits             1   396.52 240.59
## + num_store_purchases    1   396.63 240.70
## + mnt_sweet_products     1   397.07 241.15
## + age                    1   397.50 241.57
## + num_deals_purchases    1   398.05 242.13
## + mnt_wines              1   398.09 242.16
## + mnt_fish_products      1   398.22 242.29
## + num_web_purchases      1   398.25 242.32
## + income_log             1   398.26 242.33
## + mnt_gold_prods         1   398.27 242.35
## - num_web_visits_month   1   417.32 257.40
## - recency                1   432.73 272.81
## - num_catalog_purchases  1   435.72 275.80
## - accepted_any           1   445.35 285.42
## 
## Step:  AIC=234.03
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month + 
##     marital_status
## 
##                         Df Deviance    AIC
## + mnt_meat_products      1   365.05 221.81
## + children               1   376.14 232.91
## + num_store_purchases    1   376.16 232.93
## + education              3   373.20 233.97
## <none>                       379.27 234.03
## + mnt_fruits             1   377.79 234.55
## + age                    1   378.22 234.99
## + mnt_sweet_products     1   378.49 235.26
## + num_deals_purchases    1   378.80 235.56
## + mnt_fish_products      1   378.85 235.62
## + mnt_wines              1   378.87 235.64
## + num_web_purchases      1   378.99 235.76
## + mnt_gold_prods         1   379.03 235.80
## + income_log             1   379.05 235.82
## - marital_status         3   398.50 247.26
## - num_web_visits_month   1   398.89 251.65
## - recency                1   413.53 266.29
## - num_catalog_purchases  1   415.68 268.45
## - accepted_any           1   426.38 279.15
## 
## Step:  AIC=225.45
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month + 
##     marital_status + mnt_meat_products
## 
##                         Df Deviance    AIC
## + num_store_purchases    1   360.11 222.51
## + education              3   357.63 224.04
## <none>                       365.05 225.45
## + age                    1   364.06 226.46
## + children               1   364.17 226.57
## + mnt_gold_prods         1   364.38 226.78
## + num_deals_purchases    1   364.39 226.79
## + income_log             1   364.80 227.21
## + mnt_fish_products      1   364.83 227.24
## + mnt_wines              1   364.94 227.34
## + mnt_fruits             1   365.00 227.40
## + mnt_sweet_products     1   365.01 227.41
## + num_web_purchases      1   365.02 227.43
## - num_catalog_purchases  1   377.11 235.51
## - mnt_meat_products      1   379.27 237.67
## - marital_status         3   383.27 237.68
## - num_web_visits_month   1   395.37 253.77
## - recency                1   398.79 257.20
## - accepted_any           1   411.48 269.88
## 
## Step:  AIC=225.32
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month + 
##     marital_status + mnt_meat_products + num_store_purchases
## 
##                         Df Deviance    AIC
## + education              3   352.27 223.48
## + num_deals_purchases    1   357.89 225.10
## <none>                       360.11 225.32
## + children               1   359.04 226.25
## + num_web_purchases      1   359.15 226.36
## + age                    1   359.44 226.65
## + mnt_gold_prods         1   359.77 226.99
## + mnt_wines              1   359.84 227.05
## + mnt_fruits             1   359.89 227.10
## + income_log             1   359.96 227.17
## + mnt_fish_products      1   360.00 227.21
## + mnt_sweet_products     1   360.09 227.30
## - num_store_purchases    1   365.05 228.26
## - num_catalog_purchases  1   375.64 238.85
## - marital_status         3   379.85 239.07
## - mnt_meat_products      1   376.16 239.38
## - num_web_visits_month   1   386.07 249.29
## - recency                1   393.35 256.57
## - accepted_any           1   404.76 267.98
## 
## Step:  AIC=226.72
## response ~ accepted_any + recency + num_catalog_purchases + num_web_visits_month + 
##     marital_status + mnt_meat_products + num_store_purchases + 
##     education

The strongest predictor is accepted_any1: if a customer has accepted offers in the past, they are much more likely to respond again.

The level of education affects the probability of response (it increases as the level of education increases). Marital status also has an impact: single people are more likely to respond. Recency matters as well: as the number of days since the last purchase increases, the probability of response decreases.

The error made by the model is much lower than the error made by the model without variables, so it can be said that the variables are explaining the phenomenon well.

best_model
## 
## Call:  glm(formula = response ~ accepted_any + recency + num_catalog_purchases + 
##     num_web_visits_month + marital_status + mnt_meat_products + 
##     num_store_purchases + education, family = binomial, data = train, 
##     weights = class_weights)
## 
## Coefficients:
##                  (Intercept)                 accepted_any1  
##                    -2.209697                      1.944504  
##                      recency         num_catalog_purchases  
##                    -0.026595                      0.243715  
##         num_web_visits_month  marital_statusDivorced_Widow  
##                     0.364284                      0.092536  
##        marital_statusMarried        marital_statusTogether  
##                    -1.124751                     -1.145687  
##            mnt_meat_products           num_store_purchases  
##                     0.003287                     -0.115229  
##      educationUndergrad_Grad               educationMaster  
##                     0.286017                      0.580893  
##                 educationPhD  
##                     1.105900  
## 
## Degrees of Freedom: 1547 Total (i.e. Null);  1535 Residual
## Null Deviance:       546.8 
## Residual Deviance: 352.3     AIC: 226.7
# Predicted response probabilities on the test set from the stepwise-selected model.
prob_test <- predict(best_model, test, type = "response")

The most reliable variables appear to be accepted_any1, num_web_visits_month, recency, and the “Married” and “Together” modes of marital_status.

# Odds ratios with profile-likelihood 95% CIs (exponentiated coefficients).
suppressWarnings(exp(cbind(OR = coef(best_model), confint(best_model))))
## Waiting for profiling to be done...
##                                     OR      2.5 %     97.5 %
## (Intercept)                  0.1097339 0.01904037  0.5811542
## accepted_any1                6.9901646 3.89528609 12.9685251
## recency                      0.9737560 0.96425081  0.9828336
## num_catalog_purchases        1.2759805 1.12140206  1.4637073
## num_web_visits_month         1.4394830 1.23896692  1.6919729
## marital_statusDivorced_Widow 1.0969526 0.50066277  2.4188736
## marital_statusMarried        0.3247335 0.16015785  0.6434069
## marital_statusTogether       0.3180054 0.14543035  0.6783637
## mnt_meat_products            1.0032920 1.00171513  1.0049890
## num_store_purchases          0.8911617 0.80535213  0.9826276
## educationUndergrad_Grad      1.3311152 0.47104501  3.8809111
## educationMaster              1.7876336 0.55188798  5.9663140
## educationPhD                 3.0219425 1.00942382  9.4254582

num_catalog_purchases, mnt_meat_products, and num_web_visits_month have the highest VIF values: people who buy a lot of meat also tend to order by catalog and to visit the website. Since all values are below the usual threshold of 5, the model can still distinguish the impact of each variable.

education and marital_status have GVIF values close to 1.0, meaning that education level and marital status are essentially uncorrelated with the other predictors in the model.

# Generalized VIFs to check multicollinearity among the selected predictors.
vif(best_model)
##                           GVIF Df GVIF^(1/(2*Df))
## accepted_any          1.142110  1        1.068695
## recency               1.091630  1        1.044811
## num_catalog_purchases 2.320847  1        1.523433
## num_web_visits_month  2.092538  1        1.446561
## marital_status        1.087119  3        1.014019
## mnt_meat_products     2.171771  1        1.473693
## num_store_purchases   1.553058  1        1.246217
## education             1.073621  3        1.011910
# ROC curve on the test labels; direction "<" means higher scores favor class 1.
roc_obj <- roc(
  response = as.numeric(as.character(test$response)),
  predictor = prob_test,
  direction = "<"
)
## Setting levels: control = 0, case = 1
# Area under the ROC curve for the logistic model.
auc_value <- auc(roc_obj)

If you take a random customer who responded and one who did not respond, the model has an 85.7% chance of assigning a higher score to the one who actually responded. The curve rises rapidly toward the upper left corner, indicating that you can achieve high sensitivity without sacrificing too much specificity (it does not confuse no responses).

# Plot the logistic-model ROC curve with the AUC embedded in the title,
# then overlay the chance diagonal as a visual reference.
roc_title <- paste("ROC Curve - Logistic Model (AUC =", round(auc_value, 3), ")")
plot(roc_obj, col = "skyblue", lwd = 2, main = roc_title)
abline(a = 0, b = 1, lty = 2, col = "gray")

# Optimal cut-off via Youden's J statistic (maximizes sensitivity + specificity - 1).
opt <- coords(
  roc_obj,
  x = "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", "specificity")
)

opt
##   threshold sensitivity specificity
## 1 0.4565964   0.7878788   0.7960993

Classification with best threshold

# Extract the Youden-optimal threshold as a plain numeric scalar.
# (The original first assigned opt["threshold"] and then immediately
# overwrote it with as.numeric(opt[1]); one extraction is enough.)
threshold <- as.numeric(opt$threshold)
# Classify test observations using the tuned threshold.
pred_class <- ifelse(prob_test >= threshold, 1, 0)
pred_class <- factor(pred_class, levels = c(0, 1))
# Sanity check: predictions align one-to-one with the test labels.
length(pred_class)
## [1] 663
length(test$response)
## [1] 663

Specificity (true negatives) identifies 80% of those who will not respond. Sensitivity is 0.788, indicating that you can identify about 79% of interested customers.

There are a total of 21 false negatives that would have responded yes, and the model tends to make the less serious error (115 false positives).

Confusion Matrix

# Confusion matrix with "1" (responder) as the positive class.
cm <- confusionMatrix(
  pred_class,
  test$response,
  positive = "1"
)
# Long-format counts for plotting.
cm_df <- as.data.frame(cm$table)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")

# Heatmap of the confusion-matrix counts.
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "#F7FBFF", high = "skyblue") +
  labs(
    title = "Confusion Matrix – Logistic Regression",
    x = "Actual Class",
    y = "Predicted Class",
    fill = "Count"
  ) +
  theme_minimal(base_size = 14)

Metrics

# Confusion matrix on the test set (positive class = responder).
cm <- confusionMatrix(pred_class, test$response, positive = "1")

# Extract each metric once and reuse it below (the original assigned
# these and then re-extracted the same values inside the tibble).
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
specificity <- cm$byClass["Specificity"]
balanced_acc <- cm$byClass["Balanced Accuracy"]

# Summary row for the logistic model, to be compared with later models.
metrics_logistic <- tibble::tibble(
  Model = "Logistica",
  AUC = as.numeric(auc_value),
  Precision = as.numeric(precision),
  Recall = as.numeric(recall),
  F1 = as.numeric(f1),
  Specificity = as.numeric(specificity),
  Balanced_Accuracy = as.numeric(balanced_acc)
)

metrics_logistic
## # A tibble: 1 × 7
##   Model       AUC Precision Recall    F1 Specificity Balanced_Accuracy
##   <chr>     <dbl>     <dbl>  <dbl> <dbl>       <dbl>             <dbl>
## 1 Logistica 0.857     0.404  0.788 0.534       0.796             0.792

LASSO and RIDGE

Class imbalance was addressed using the weights calculated previously for the logistic regression. Elements of the minority class are assigned a weight equal to the frequency of the majority class, and vice versa. This forces the models to give the same overall importance to observations from both classes.

# Identify categorical predictors (factor or character), excluding the response.
cat_cols <- names(train)[sapply(train, function(x) is.factor(x) || is.character(x))]
cat_cols <- setdiff(cat_cols, "response")

# Align the factor levels of test with train so model.matrix produces
# identical dummy columns for both sets (unseen levels become NA).
for (col in cat_cols) {
  train[[col]] <- as.factor(train[[col]])
  test[[col]] <- factor(test[[col]], levels = levels(train[[col]]))
}


# Design matrices without the intercept column.
X_train <- model.matrix(response ~ ., data = train)[, -1]
y_train <- train$response

# NOTE(review): NA rows are dropped from test only here; later code mixes
# y_test (from test_clean) with predictions computed on other objects —
# confirm row alignment is preserved throughout.
test_clean <- na.omit(test)
X_test <- model.matrix(response ~ ., data = test_clean)[, -1]
y_test <- test_clean$response


# Standardize using training means/SDs only (no test-set leakage).
X_train_scaled <- scale(X_train)

train_means <- attr(X_train_scaled, "scaled:center")
train_sds <- attr(X_train_scaled, "scaled:scale")

X_test_scaled <- scale(X_test, center = train_means, scale = train_sds)

# Drop the identifier column if present — it carries no predictive signal.
X_train_scaled <- X_train_scaled[, colnames(X_train_scaled) != "id"]
X_test_scaled <- X_test_scaled[, colnames(X_test_scaled) != "id"]

Lasso

# Cross-validated Lasso (alpha = 1) logistic regression; lambda chosen by AUC,
# class_weights compensating for the response imbalance.
set.seed(1926)
lasso_model <- suppressWarnings(cv.glmnet(
  x = X_train_scaled, 
  y = y_train,   
  family = "binomial",
  alpha = 1,
  weights = class_weights,  
  type.measure = "auc",
  standardize = FALSE       # features were already scaled above
))

# Test-set probabilities at the lambda minimizing CV error.
prob_lasso <- predict(lasso_model, X_test_scaled, type = "response", s = "lambda.min")

The most predictive variables in this case are also accepted_any1, num_web_visits_month, mnt_meat_products, and num_catalog_purchases. Recency, being in a relationship, and a preference for in-store purchases are associated with a lower response to the latest advertising campaign.

A slight negative effect is also given by the type of education, the number of children, and age.

# Coefficients at lambda.min; Lasso zeroes out unselected variables.
lasso_coefs <- coef(lasso_model, s = "lambda.min")
lasso_coefs_df <- as.data.frame(as.matrix(lasso_coefs))
colnames(lasso_coefs_df) <- "Coefficiente"

# Keep only the variables actually retained (non-zero coefficient).
selected_features <- lasso_coefs_df[lasso_coefs_df$Coefficiente != 0, , drop = FALSE]
print(selected_features)
##                              Coefficiente
## (Intercept)                   -0.80505974
## educationMaster                0.07414495
## educationPhD                   0.31827324
## marital_statusDivorced_Widow   0.03894996
## marital_statusMarried         -0.45274937
## marital_statusTogether        -0.40690552
## recency                       -0.69423248
## mnt_fruits                     0.05899273
## mnt_meat_products              0.56346952
## mnt_sweet_products             0.03120537
## num_deals_purchases            0.29460892
## num_web_purchases              0.02534825
## num_catalog_purchases          0.54774314
## num_store_purchases           -0.41233530
## num_web_visits_month           0.67618126
## children                      -0.29690081
## age                           -0.08559063
## accepted_any1                  0.74755015

Ridge

# Cross-validated Ridge (alpha = 0) logistic regression, same setup as the Lasso.
set.seed(1926)
ridge_model <- suppressWarnings(cv.glmnet(
  x = X_train_scaled, 
  y = y_train,   
  family = "binomial",
  alpha = 0,
  weights = class_weights,
  type.measure = "auc",
  standardize = FALSE       # features were already scaled above
))

# Test-set probabilities and coefficient table at lambda.min.
prob_ridge <- predict(ridge_model, X_test_scaled, type = "response", s = "lambda.min")
ridge_coefs <- coef(ridge_model, s = "lambda.min")
ridge_coefs_df <- as.data.frame(as.matrix(ridge_coefs))
colnames(ridge_coefs_df) <- "Coefficiente"

# Ridge shrinks but does not zero coefficients, so this keeps (almost) all rows.
selected_features <- ridge_coefs_df[ridge_coefs_df$Coefficiente != 0, , drop = FALSE]
print(selected_features)
##                              Coefficiente
## (Intercept)                  -0.697028903
## educationUndergrad_Grad      -0.009367024
## educationMaster               0.073622350
## educationPhD                  0.272118515
## marital_statusDivorced_Widow  0.090013229
## marital_statusMarried        -0.347226518
## marital_statusTogether       -0.318800317
## recency                      -0.595037221
## mnt_wines                     0.007763906
## mnt_fruits                    0.076548042
## mnt_meat_products             0.387010667
## mnt_fish_products            -0.038078655
## mnt_sweet_products            0.056292145
## mnt_gold_prods               -0.028271049
## num_deals_purchases           0.257222909
## num_web_purchases             0.087385594
## num_catalog_purchases         0.398843285
## num_store_purchases          -0.386560393
## num_web_visits_month          0.484419372
## children                     -0.246603893
## total_spent                   0.148510899
## age                          -0.109763078
## income_log                    0.010034507
## accepted_any1                 0.638172342

The Ridge performs in the same way as the Lasso, which tends to bring the coefficients of the less influential variables to 0. But even though these variables are less influential, the fact that the two models perform in the same way allows us to say that they do not introduce noise.

# ROC curves and AUCs for both penalized models on the cleaned test set.
roc_lasso <- roc(y_test, as.numeric(prob_lasso))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_ridge <- roc(y_test, as.numeric(prob_ridge))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_lasso <- auc(roc_lasso)
auc_ridge <- auc(roc_ridge)

cat("AUC Lasso:", round(auc_lasso, 3), "\n")
## AUC Lasso: 0.863
cat("AUC Ridge:", round(auc_ridge, 3), "\n")
## AUC Ridge: 0.86
# Long-format data frames of the two ROC curves, stacked for a single plot.
roc_lasso_df <- data.frame(
  False_Positive_Rate = 1 - roc_lasso$specificities,
  True_Positive_Rate = roc_lasso$sensitivities,
  Model = "Lasso"
)

roc_ridge_df <- data.frame(
  False_Positive_Rate = 1 - roc_ridge$specificities,
  True_Positive_Rate = roc_ridge$sensitivities,
  Model = "Ridge"
)

roc_df <- rbind(roc_lasso_df, roc_ridge_df)
# Overlaid ROC curves of Lasso vs Ridge with the chance diagonal.
ggplot(roc_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, color = Model)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - Lasso vs Ridge Regression",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14) +
  scale_color_manual(values = c("Lasso" = "blue", "Ridge" = "skyblue")) +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Youden-optimal thresholds for each penalized model, then class predictions
# and confusion matrices on the test set (positive class = responder).
opt_lasso <- coords(roc_lasso, x = "best", best.method = "youden", ret = c("threshold"))
opt_ridge <- coords(roc_ridge, x = "best", best.method = "youden", ret = c("threshold"))

threshold_lasso <- as.numeric(opt_lasso)
threshold_ridge <- as.numeric(opt_ridge)
pred_lasso_class <- factor(ifelse(prob_lasso >= threshold_lasso, 1, 0), levels = c(0, 1))
pred_ridge_class <- factor(ifelse(prob_ridge >= threshold_ridge, 1, 0), levels = c(0, 1))
cm_lasso <- confusionMatrix(pred_lasso_class, y_test, positive = "1")
cm_ridge <- confusionMatrix(pred_ridge_class, y_test, positive = "1")
# Heatmap of a caret confusion matrix.
# cm:    a caret::confusionMatrix object
# title: the plot title
# Returns a ggplot object (tile heatmap with count labels).
plot_cm <- function(cm, title) {
  counts <- setNames(as.data.frame(cm$table), c("Predicted", "Actual", "Freq"))

  ggplot(counts, aes(x = Actual, y = Predicted, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), size = 6) +
    scale_fill_gradient(low = "#F7FBFF", high = "blue") +
    labs(title = title, x = "Actual Class", y = "Predicted Class", fill = "Count") +
    theme_minimal(base_size = 14)
}

Both models have a higher number of false positives than false negatives.

# Confusion-matrix heatmaps for the two penalized models.
plot_cm(cm_lasso, "Confusion Matrix – Lasso")

plot_cm(cm_ridge, "Confusion Matrix – Ridge")

# Summary metrics for Lasso and Ridge, one row per model.
metrics_lasso_ridge <- tibble(
  Model = c("Lasso", "Ridge"),
  AUC = c(as.numeric(auc_lasso), as.numeric(auc_ridge)),
  Precision = c(cm_lasso$byClass["Precision"], cm_ridge$byClass["Precision"]),
  Recall = c(cm_lasso$byClass["Recall"], cm_ridge$byClass["Recall"]),
  F1 = c(cm_lasso$byClass["F1"], cm_ridge$byClass["F1"]),
  Specificity = c(cm_lasso$byClass["Specificity"], cm_ridge$byClass["Specificity"]),
  Balanced_Accuracy = c(cm_lasso$byClass["Balanced Accuracy"], cm_ridge$byClass["Balanced Accuracy"])
)

metrics_lasso_ridge
## # A tibble: 2 × 7
##   Model   AUC Precision Recall    F1 Specificity Balanced_Accuracy
##   <chr> <dbl>     <dbl>  <dbl> <dbl>       <dbl>             <dbl>
## 1 Lasso 0.863     0.384  0.869 0.533       0.755             0.812
## 2 Ridge 0.860     0.373  0.859 0.520       0.746             0.803
# Ridge coefficients at lambda.min, reshaped for plotting.
coef_matrix <- coef(ridge_model, s = "lambda.min") %>% as.matrix()

coef_df <- data.frame(
  Feature = rownames(coef_matrix),
  Coefficient = coef_matrix[, 1]
)

# Drop the intercept and rank by absolute effect size.
coef_df <- coef_df %>%
  filter(Feature != "(Intercept)") %>%
  arrange(desc(abs(Coefficient)))


top_20_features <- coef_df %>%
  slice_max(order_by = abs(Coefficient), n = 20)


# Horizontal bar chart of the strongest Ridge coefficients,
# colored by sign of the effect.
ggplot(top_20_features, aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  geom_col(aes(fill = Coefficient > 0)) +
  coord_flip() +
  labs(
    title = "Top Predictors Ridge",
    x = "Variables",
    y = "Coefficients"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "blue", "FALSE" = "skyblue"),
    labels = c("TRUE" = "Positivo", "FALSE" = "Negativo"),
    name = "Influenza"
  )

# Lasso coefficients at lambda.min, reshaped for plotting.
coef_lasso_matrix <- coef(lasso_model, s = "lambda.min") %>% as.matrix()

lasso_df <- data.frame(
  Feature = rownames(coef_lasso_matrix),
  Coefficient = coef_lasso_matrix[, 1]
)


# Drop the intercept and the variables Lasso zeroed out,
# then rank by absolute effect size.
lasso_clean <- lasso_df %>%
  filter(Feature != "(Intercept)") %>%
  filter(Coefficient != 0) %>%
  arrange(desc(abs(Coefficient)))

cat("Il modello Lasso ha selezionato", nrow(lasso_clean), "variabili su", nrow(lasso_df) - 1, "\n")
## Il modello Lasso ha selezionato 17 variabili su 23
plot_data <- lasso_clean %>%
  slice_max(order_by = abs(Coefficient), n = 20)

# Horizontal bar chart of the retained Lasso coefficients, colored by sign.
ggplot(plot_data, aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  geom_col(aes(fill = Coefficient > 0)) +
  coord_flip() +
  labs(
    title = "Top Predictors Lasso",
    x = "Variables",
    y = "Coefficients"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "blue", "FALSE" = "skyblue"),
    labels = c("TRUE" = "Positivo", "FALSE" = "Negativo"),
    name = "Influenza"
  )

GAM

# GAM: smooth terms s() for continuous spend/age/income variables,
# linear (parametric) terms for counts and categoricals.
gam_formula <- as.formula(
  response ~ s(age) + s(income_log) + s(recency) +
    s(mnt_wines) + s(mnt_fruits) + s(mnt_meat_products) +
    s(mnt_fish_products) + s(mnt_sweet_products) + s(mnt_gold_prods) +
    education + marital_status + children +
    num_deals_purchases + num_web_purchases + num_catalog_purchases +
    num_store_purchases + num_web_visits_month + accepted_any
)
# NOTE(review): this reassigns `test` (dropping NA rows) after earlier code
# built test_clean separately — confirm downstream objects stay row-aligned.
test <- na.omit(test)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-4. For overview type '?mgcv'.
# Weighted binomial GAM on the training set.
gam_model <- suppressWarnings(gam(
  gam_formula, 
  data = train, 
  family = binomial,
  weights = class_weights 
))

# Predicted response probabilities on the (NA-free) test set.
prob_gam <- predict(gam_model, test, type = "response")

Variables such as age, recency, mnt_fruits, mnt_fish_products, and mnt_sweet_products have an EDF of 1,000. Although the model has the freedom to fit curves, it has nevertheless chosen straight lines for these variables. The nonlinear relationships found concern income and meat products, which increase the probability of response up to a certain point. GOLD products also show a curvature, but it is not statistically significant (see p_value).

# Partial-effect plots of all smooth terms on a single page, with CI shading.
plot(gam_model, pages = 1, shade = TRUE, seWithMean = TRUE)

summary(gam_model)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## response ~ s(age) + s(income_log) + s(recency) + s(mnt_wines) + 
##     s(mnt_fruits) + s(mnt_meat_products) + s(mnt_fish_products) + 
##     s(mnt_sweet_products) + s(mnt_gold_prods) + education + marital_status + 
##     children + num_deals_purchases + num_web_purchases + num_catalog_purchases + 
##     num_store_purchases + num_web_visits_month + accepted_any
## 
## Parametric coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -3.51122    1.07933  -3.253 0.001141 ** 
## educationUndergrad_Grad       0.28403    0.55747   0.509 0.610402    
## educationMaster               0.62676    0.65171   0.962 0.336189    
## educationPhD                  1.32950    0.63563   2.092 0.036473 *  
## marital_statusDivorced_Widow  0.28438    0.43744   0.650 0.515631    
## marital_statusMarried        -1.05726    0.38150  -2.771 0.005583 ** 
## marital_statusTogether       -1.08111    0.42301  -2.556 0.010595 *  
## children                     -0.65194    0.32277  -2.020 0.043402 *  
## num_deals_purchases           0.26111    0.10788   2.420 0.015508 *  
## num_web_purchases             0.03116    0.08793   0.354 0.723076    
## num_catalog_purchases         0.26119    0.08480   3.080 0.002069 ** 
## num_store_purchases          -0.15982    0.06795  -2.352 0.018661 *  
## num_web_visits_month          0.39970    0.11495   3.477 0.000507 ***
## accepted_any1                 1.94302    0.36463   5.329 9.89e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df Chi.sq p-value    
## s(age)                1.000  1.000  0.179  0.6722    
## s(income_log)         5.352  6.561 13.094  0.0653 .  
## s(recency)            1.001  1.003 30.288  <2e-16 ***
## s(mnt_wines)          1.963  2.510  2.121  0.4534    
## s(mnt_fruits)         1.000  1.000  0.491  0.4834    
## s(mnt_meat_products)  4.364  5.461 11.948  0.0369 *  
## s(mnt_fish_products)  1.000  1.000  0.593  0.4415    
## s(mnt_sweet_products) 1.000  1.001  0.026  0.8717    
## s(mnt_gold_prods)     3.960  4.840  5.027  0.4837    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.462   Deviance explained = 44.3%
## UBRE = -0.75865  Scale est. = 1         n = 1548
# GAM ROC/AUC, Youden threshold, class predictions, and confusion matrix.
roc_gam <- roc(y_test, prob_gam)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_gam <- auc(roc_gam)
opt_gam <- coords(roc_gam, x = "best", best.method = "youden", ret = "threshold")
threshold_gam <- as.numeric(opt_gam)
pred_gam_class <- factor(ifelse(prob_gam >= threshold_gam, 1, 0), levels = c(0, 1))
cm_gam <- confusionMatrix(pred_gam_class, y_test, positive = "1")
# ROC curve coordinates in long format for ggplot.
roc_gam_df <- data.frame(
  False_Positive_Rate = 1 - roc_gam$specificities,
  True_Positive_Rate = roc_gam$sensitivities
)
# Recompute the GAM ROC with an explicit numeric probability vector.
roc_gam <- roc(y_test, as.numeric(prob_gam))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# BUG FIX: this previously assigned to auc_lasso, silently overwriting the
# Lasso AUC computed earlier; store the GAM AUC under its own name.
auc_gam <- auc(roc_gam)

The ROC curve appears more solid, with the same AUC area as previous models.

# ROC curve for the GAM with the chance diagonal.
ggplot(roc_gam_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(color = "red", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - GAM",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)

# NOTE(review): these four lines repeat the threshold/prediction/confusion
# computation already done above — kept as-is, but one copy could be removed.
opt_gam <- coords(roc_gam, x = "best", best.method = "youden", ret = c("threshold"))
threshold_gam <- as.numeric(opt_gam)
pred_gam_class <- factor(ifelse(prob_gam >= threshold_gam, 1, 0), levels = c(0, 1))
cm_gam <- confusionMatrix(pred_gam_class, y_test, positive = "1")

The GAM model improves the detection of True Negatives compared to Ridge and Lasso, and it also produces fewer False Positives.

# Long-format counts of the GAM confusion matrix for plotting.
cm_df <- as.data.frame(cm_gam$table)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")

# Heatmap of the GAM confusion matrix.
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "#F7FBFF", high = "#D62728") +
  labs(
    title = "Confusion Matrix – GAM Logistic Regression",
    x = "Actual Class",
    y = "Predicted Class",
    fill = "Count"
  ) +
  theme_minimal(base_size = 14)

# Summary metrics row for the GAM, comparable with the other models' tables.
metrics_gam <- tibble(
  Model = "GAM",
  AUC = as.numeric(auc_gam),
  Precision = cm_gam$byClass["Precision"],
  Recall = cm_gam$byClass["Recall"],
  F1 = cm_gam$byClass["F1"],
  Specificity = cm_gam$byClass["Specificity"],
  Balanced_Accuracy = cm_gam$byClass["Balanced Accuracy"]
)

print(metrics_gam)
## # A tibble: 1 × 7
##   Model   AUC Precision Recall    F1 Specificity Balanced_Accuracy
##   <chr> <dbl>     <dbl>  <dbl> <dbl>       <dbl>             <dbl>
## 1 GAM   0.869     0.491  0.788 0.605       0.856             0.822

s(recency) shows an almost perfectly linear decline: each additional day since the last purchase linearly reduces the probability of success.

# Build a variable-"importance" chart from the GAM summary: test statistics of
# smooth terms (Chi-sq) and of parametric terms (z-value squared, Chi-sq approx).
gam_summary <- summary(gam_model)

# Smooth terms: take the test-statistic column (Chi.sq for binomial, F otherwise).
if (!is.null(gam_summary$s.table)) {
  smooth_table <- as.data.frame(gam_summary$s.table)
  stat_col_smooth <- grep("Chi\\.sq|F", colnames(smooth_table), value = TRUE)[1]

  smooth_data <- data.frame(
    Feature = rownames(smooth_table),
    Statistic = smooth_table[[stat_col_smooth]],
    Type = "Smooth"
  )
} else {
  smooth_data <- data.frame()
}

p_table <- gam_summary$p.table

# Parametric terms: square the z-values so they are comparable (Chi-sq scale).
if (!is.null(p_table) && nrow(p_table) > 0) {
  p_df <- as.data.frame(p_table)
  stat_col_linear <- grep("value", colnames(p_df), value = TRUE)[1]

  # Fall back to the third column if no "*value" column name is found.
  stats_values <- if (!is.na(stat_col_linear)) p_df[[stat_col_linear]] else p_table[, 3]

  linear_data <- data.frame(
    Feature = rownames(p_df),
    Statistic = stats_values^2,
    Type = "Parametric (Linear)"
  )

  linear_data <- linear_data %>% filter(Feature != "(Intercept)")
} else {
  linear_data <- data.frame()
}

gam_importance <- bind_rows(smooth_data, linear_data)

# Plot the top-20 terms by test statistic, colored by term type.
if (nrow(gam_importance) > 0) {
  gam_importance <- gam_importance %>%
    arrange(desc(Statistic)) %>%
    slice_max(order_by = Statistic, n = 20)

  ggplot(gam_importance, aes(x = reorder(Feature, Statistic), y = Statistic, fill = Type)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "GAM",
      subtitle = "Statistica del test (Chi-quadro approx)",
      x = "Variables",
      y = "Importance (Test Statistic)",
      fill = "Tipo di Relazione"
    ) +
    theme_minimal() +
    scale_fill_manual(values = c("Smooth" = "red", "Parametric (Linear)" = "blue"))
} else {
  print("Nessuna variabile significativa trovata o modello vuoto.")
}

Random Forest

# Random forest classifier; mtry = sqrt(p) is the usual classification default.
set.seed(1926)
rf_model <- randomForest(
  response ~ .,
  data = train,
  ntree = 500, 
  mtry = floor(sqrt(ncol(train) - 1)),
  importance = TRUE
)

# Probability of the positive class (second column of the vote matrix).
prob_rf <- predict(rf_model, test, type = "prob")[, 2]

Random Forests has high levels of both specificity and sensitivity.

# ROC/AUC for the random forest on the test set.
roc_rf <- roc(y_test, prob_rf)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rf <- auc(roc_rf)
cat("AUC Random Forest:", round(auc_rf, 3), "\n")
## AUC Random Forest: 0.873
# ROC curve coordinates in long format for ggplot.
roc_rf_df <- data.frame(
  False_Positive_Rate = 1 - roc_rf$specificities,
  True_Positive_Rate = roc_rf$sensitivities
)

# ROC curve for the random forest with the chance diagonal.
ggplot(roc_rf_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(color = "darkgreen", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - Random Forest",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)

# Youden threshold, class predictions, and confusion matrix for the forest.
opt_rf <- coords(roc_rf, x = "best", best.method = "youden", ret = c("threshold"))
threshold_rf <- as.numeric(opt_rf)
pred_rf_class <- factor(ifelse(prob_rf >= threshold_rf, 1, 0), levels = c(0, 1))
cm_rf <- confusionMatrix(pred_rf_class, y_test, positive = "1")

The model correctly identifies the vast majority of those who will not respond, reducing false positives to 94. It loses only 24 customers (false negatives).

# Long-format counts of the random-forest confusion matrix for plotting.
cm_rf_df <- as.data.frame(cm_rf$table)
colnames(cm_rf_df) <- c("Predicted", "Actual", "Freq")

# Heatmap of the random-forest confusion matrix.
ggplot(cm_rf_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "#F7FBFF", high = "darkgreen") +
  labs(
    title = "Confusion Matrix – Random Forest",
    x = "Actual Class",
    y = "Predicted Class",
    fill = "Count"
  ) +
  theme_minimal(base_size = 14)

# Summary metrics row for the random forest.
metrics_rf <- tibble(
  Model = "Random Forest",
  AUC = as.numeric(auc_rf),
  Precision = cm_rf$byClass["Precision"],
  Recall = cm_rf$byClass["Recall"],
  F1 = cm_rf$byClass["F1"],
  Specificity = cm_rf$byClass["Specificity"],
  Balanced_Accuracy = cm_rf$byClass["Balanced Accuracy"]
)

print(metrics_rf)
## # A tibble: 1 × 7
##   Model           AUC Precision Recall    F1 Specificity Balanced_Accuracy
##   <chr>         <dbl>     <dbl>  <dbl> <dbl>       <dbl>             <dbl>
## 1 Random Forest 0.873     0.444  0.758 0.560       0.833             0.795

Recency continues to be the best predictor, followed by income and total spent.

# Variable importance from the forest.
imp_matrix <- importance(rf_model)

# Uses the LAST column of the importance matrix — presumably
# MeanDecreaseGini; TODO confirm this is the intended measure.
rf_imp_df <- data.frame(
  Feature = rownames(imp_matrix),
  Importance = imp_matrix[, ncol(imp_matrix)]
)

# Top-20 variables by importance.
rf_clean <- rf_imp_df %>%
  arrange(desc(Importance)) %>%
  slice_max(order_by = Importance, n = 20)

# Horizontal bar chart of forest variable importance.
ggplot(rf_clean, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Feature Importance - Random Forest",
    x = "Variables",
    y = "Importance"
  ) +
  theme_minimal()

XGBoost

# One-hot encode all predictors (drop the id column) for XGBoost.
df_logistic <- df_logistic%>%
  dplyr::select(-id)
dummies <- caret::dummyVars(~., data = df_logistic %>% dplyr::select(-response))
X_xgb_full <- predict(dummies, newdata = df_logistic)
# 0/1 numeric labels (factor levels 1/2 shifted down by one).
y_xgb_full <- as.numeric(df_logistic$response) - 1 

# Split using `idx` — assumed to be the train/test row index created
# earlier in the document (not visible here); TODO confirm.
X_train_xgb <- X_xgb_full[idx, ]
y_train_xgb <- y_xgb_full[idx]
X_test_xgb <- X_xgb_full[-idx, ]
y_test_xgb <- y_xgb_full[-idx]

# Native XGBoost data containers.
dtrain <- xgb.DMatrix(data = X_train_xgb, label = y_train_xgb)
dtest <- xgb.DMatrix(data = X_test_xgb, label = y_test_xgb)
# Boosting hyperparameters: slow learning rate, moderate depth,
# row/column subsampling for regularization.
set.seed(1926)
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  eta = 0.05,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.8
)

# Handle class imbalance via the negative/positive ratio.
scale_pos_weight <- sum(y_train_xgb == 0) / sum(y_train_xgb == 1)
params$scale_pos_weight <- scale_pos_weight

# Train with early stopping: stop if test AUC does not improve for 50 rounds.
xgb_model <- suppressWarnings(xgb.train(
  params = params,
  data = dtrain,
  nrounds = 500,
  watchlist = list(train = dtrain, test = dtest),
  print_every_n = 50,
  early_stopping_rounds = 50,
  verbose = 0
))

# Predicted probabilities on the test matrix.
prob_xgb <- predict(xgb_model, dtest)

The ROC curve is closest to the upper left corner.

# ROC/AUC for XGBoost.
# NOTE(review): y_test comes from the glmnet prep (test_clean) while prob_xgb
# was computed on rows indexed by -idx — confirm the two are row-aligned.
roc_xgb <- roc(y_test, prob_xgb)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_xgb <- auc(roc_xgb)
cat("AUC XGBoost:", round(auc_xgb, 3), "\n")
## AUC XGBoost: 0.888
# ROC curve coordinates in long format for ggplot.
roc_xgb_df <- data.frame(
  False_Positive_Rate = 1 - roc_xgb$specificities,
  True_Positive_Rate = roc_xgb$sensitivities
)

# ROC curve for XGBoost with the chance diagonal.
ggplot(roc_xgb_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  # `size` for lines is deprecated since ggplot2 3.4.0; use `linewidth`.
  geom_line(color = "blue", linewidth = 1.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - XGBoost",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14)

# Youden-optimal threshold for the XGBoost probabilities.
opt_xgb <- coords(roc_xgb, x = "best", best.method = "youden", ret = c("threshold"))
threshold_xgb <- as.numeric(opt_xgb)

It identifies 83% of respondents (true positives) and has the lowest number of false negatives among the models tested.

# Class predictions at the tuned threshold and the resulting confusion matrix.
pred_xgb_class <- factor(ifelse(prob_xgb >= threshold_xgb, 1, 0), levels = c(0, 1))
cm_xgb <- confusionMatrix(pred_xgb_class, y_test, positive = "1")
cm_xgb_df <- as.data.frame(cm_xgb$table)
colnames(cm_xgb_df) <- c("Predicted", "Actual", "Freq")

# Heatmap of the XGBoost confusion matrix.
ggplot(cm_xgb_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 6) +
  scale_fill_gradient(low = "#F7FBFF", high = "blue") +
  labs(
    title = "Confusion Matrix – XGBoost",
    x = "Actual Class",
    y = "Predicted Class",
    fill = "Count"
  ) +
  theme_minimal(base_size = 14)

metrics_xgb <- tibble(
  Model = "XGBoost",
  AUC = as.numeric(auc_xgb),
  Precision = cm_xgb$byClass["Precision"],
  Recall = cm_xgb$byClass["Recall"],
  F1 = cm_xgb$byClass["F1"],
  Specificity = cm_xgb$byClass["Specificity"],
  Balanced_Accuracy = cm_xgb$byClass["Balanced Accuracy"]
)

print(metrics_xgb)
## # A tibble: 1 × 7
##   Model     AUC Precision Recall    F1 Specificity Balanced_Accuracy
##   <chr>   <dbl>     <dbl>  <dbl> <dbl>       <dbl>             <dbl>
## 1 XGBoost 0.888     0.474  0.828 0.603       0.839             0.833

XGBoost ranks variables based on Gain (the incremental contribution of each variable to the improvement of the model). It confirms that spending on Meat and Wines, together with Recency, form the “golden triangle” for predicting customer response behavior.

# Gain-based variable importance from the fitted booster.
importance_matrix <- xgb.importance(model = xgb_model)

# Keep the 20 most important features by Gain and order them for plotting.
# slice_max() replaces the superseded top_n(); ties are kept in both
# (slice_max defaults to with_ties = TRUE), so row content is unchanged.
plot_data <- importance_matrix %>%
  as.data.frame() %>%
  slice_max(Gain, n = 20) %>%
  mutate(Feature = reorder(Feature, Gain))

ggplot(plot_data, aes(x = Feature, y = Gain, fill = Gain)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +  # horizontal bars: feature names on the y-axis
  scale_fill_viridis_c(option = "magma") +
  geom_text(aes(label = round(Gain, 3)),
            hjust = -0.2, size = 3.5, color = "black") +
  theme_minimal() +
  labs(
    title = "Feature Importance (XGBoost)",
    x = NULL,
    y = "Importance (Gain)"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.y = element_text(size = 11, face = "bold")
  )

Robustness Check: 5-Fold Cross Validation

In order to test the robustness and generalization ability of the models used, the data is split into 5 folds; each model is trained on 4 folds and evaluated on the held-out fold, rotating through all 5.

XGBoost has the highest average AUC, but Random Forest is more consistent because it has a lower standard deviation (it performs more or less the same across the 5 folds). Logistic regression has a very similar average to XGBoost.

set.seed(1926)
k <- 5
# Randomly assign each training row to one of the k folds.
folds <- sample(rep(1:k, length.out = nrow(train)))

# Collect one small data frame per fold and bind once at the end —
# avoids the O(n^2) rbind()-in-a-loop growth of the original.
cv_rows <- vector("list", k)

for (i in seq_len(k)) {
  test_idx <- which(folds == i)
  cv_train <- train[-test_idx, ]
  cv_test <- train[test_idx, ]

  # Inverse-frequency class weights for the weighted logistic model.
  w_pos <- sum(cv_train$response == 0) / nrow(cv_train)
  w_neg <- sum(cv_train$response == 1) / nrow(cv_train)
  cv_weights <- ifelse(cv_train$response == 1, w_pos, w_neg)

  # --- Weighted logistic regression ---
  m_log <- suppressWarnings(glm(response ~ ., data = cv_train, family = binomial, weights = cv_weights))
  p_log <- predict(m_log, cv_test, type = "response")
  # Fold-local names avoid clobbering the auc_log/auc_rf/auc_xgb objects
  # computed earlier in the report.
  fold_auc_log <- auc(roc(cv_test$response, p_log, quiet = TRUE))

  # --- Random forest ---
  m_rf <- suppressWarnings(randomForest(response ~ ., data = cv_train, ntree = 100))
  p_rf <- predict(m_rf, cv_test, type = "prob")[, 2]
  fold_auc_rf <- auc(roc(cv_test$response, p_rf, quiet = TRUE))

  # --- XGBoost: dummy-encode inside the fold so the encoding is fit on
  # training rows only ---
  dummies_cv <- caret::dummyVars(~., data = cv_train %>% dplyr::select(-response))

  X_cv_train <- predict(dummies_cv, newdata = cv_train)
  y_cv_train <- as.numeric(cv_train$response) - 1
  dtrain_fold <- xgb.DMatrix(data = X_cv_train, label = y_cv_train)

  X_cv_test <- predict(dummies_cv, newdata = cv_test)
  y_cv_test <- as.numeric(cv_test$response) - 1
  dtest_fold <- xgb.DMatrix(data = X_cv_test, label = y_cv_test)

  scale_pos <- sum(y_cv_train == 0) / sum(y_cv_train == 1)
  params_cv <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    eta = 0.1,
    max_depth = 4,
    scale_pos_weight = scale_pos
  )

  m_xgb <- suppressWarnings(xgb.train(params = params_cv, data = dtrain_fold, nrounds = 100, verbose = 0))
  p_xgb <- predict(m_xgb, dtest_fold)
  fold_auc_xgb <- auc(roc(cv_test$response, p_xgb, quiet = TRUE))

  # Same row order per fold as the original: Logistic, Random Forest, XGBoost.
  cv_rows[[i]] <- data.frame(
    Model = c("Logistic", "Random Forest", "XGBoost"),
    Fold = i,
    AUC = c(as.numeric(fold_auc_log), as.numeric(fold_auc_rf), as.numeric(fold_auc_xgb))
  )
}

cv_results <- dplyr::bind_rows(cv_rows)

# Per-model summary of the fold AUCs: central tendency, spread, and range.
# across() applies all four statistics to the AUC column in one pass;
# .names = "{.fn}" keeps the original column names.
cv_summary <- cv_results %>%
  group_by(Model) %>%
  summarise(
    across(
      AUC,
      list(Mean_AUC = mean, SD_AUC = sd, Min_AUC = min, Max_AUC = max),
      .names = "{.fn}"
    )
  )

print(cv_summary)
## # A tibble: 3 × 5
##   Model         Mean_AUC SD_AUC Min_AUC Max_AUC
##   <chr>            <dbl>  <dbl>   <dbl>   <dbl>
## 1 Logistic         0.871 0.0189   0.847   0.895
## 2 Random Forest    0.862 0.0161   0.840   0.882
## 3 XGBoost          0.874 0.0316   0.841   0.912

Comparison between models

XGBoost is the best performing model in terms of pure predictive power, while GAM and Random Forest offer the best balance between accuracy and the ability to correctly identify “no” responses. If the goal is to avoid losing any potential customers at all costs, Lasso would be the best choice (recall). GAM is the most conservative and balanced model with the highest precision and F1 score.

All models converge on the same conclusion, with Recency, Accepted Any, and Meat Products driving the predicted response.

# Attach a readable model label to each per-model metrics table.
metrics_logistic <- mutate(metrics_logistic, Model = "Stepwise Logistic")
# The Lasso/Ridge table has two rows: row 1 = Lasso, row 2 = Ridge.
metrics_lasso_ridge <- mutate(metrics_lasso_ridge, Model = c("Lasso", "Ridge"))
metrics_rf <- mutate(metrics_rf, Model = "Random Forest")
metrics_gam <- mutate(metrics_gam, Model = "GAM")
metrics_xgb <- mutate(metrics_xgb, Model = "XGBoost")


# Stack every model's metrics, put the Model column first, and rank by
# AUC with the best model on top.
all_metrics_sorted <- list(
  metrics_logistic,
  metrics_lasso_ridge,
  metrics_rf,
  metrics_gam,
  metrics_xgb
) %>%
  bind_rows() %>%
  dplyr::select(Model, everything()) %>%
  arrange(desc(AUC))


# Plain data frame with clean row numbers for printing.
all_metrics_final <- as.data.frame(all_metrics_sorted)
rownames(all_metrics_final) <- NULL

print(all_metrics_final)
##               Model       AUC Precision    Recall        F1 Specificity
## 1           XGBoost 0.8879665 0.4739884 0.8282828 0.6029412   0.8386525
## 2     Random Forest 0.8725106 0.4437870 0.7575758 0.5597015   0.8333333
## 3               GAM 0.8688033 0.4905660 0.7878788 0.6046512   0.8563830
## 4             Lasso 0.8632692 0.3839286 0.8686869 0.5325077   0.7553191
## 5             Ridge 0.8598127 0.3728070 0.8585859 0.5198777   0.7464539
## 6 Stepwise Logistic 0.8572695 0.4041451 0.7878788 0.5342466   0.7960993
##   Balanced_Accuracy
## 1         0.8334677
## 2         0.7954545
## 3         0.8221309
## 4         0.8120030
## 5         0.8025199
## 6         0.7919890

SHAP Values for XGBoost

The predictive power of the Recency and Accepted_Any variables, along with meat-product spending, is confirmed.

# Long-format SHAP values computed on the held-out design matrix.
shap_long <- shap.prep(xgb_model = xgb_model, X_train = X_test_xgb)

# Beeswarm-style summary: one row per feature, points colored by feature
# value. Layer order of theme/title is irrelevant to the rendered plot.
shap.plot.summary(shap_long) +
  theme_minimal() +
  ggtitle("SHAP Summary Plot")

The red curve shows an almost perfect decline. Below 25 days of Recency, the impact is strongly positive (SHAP > 1). Above 50 days, the impact becomes consistently negative. The color of the dots indicates spending on meat.

# SHAP dependence plot for recency; color_feature = "auto" lets the
# package pick the strongest interaction candidate for the point colors.
shap.plot.dependence(
  data_long = shap_long,
  x = "recency",
  color_feature = "auto"
) +
  ggtitle("SHAP Dependence: Recency")
## `geom_smooth()` using formula = 'y ~ x'

The curve reaches a plateau around €1,000 of meat spending and then bends slightly downward.

# SHAP dependence plot for meat-product spending, colored by the
# automatically selected interaction feature.
shap.plot.dependence(
  data_long = shap_long,
  x = "mnt_meat_products",
  color_feature = "auto"
) +
  ggtitle("SHAP Dependence: Meat Products")
## `geom_smooth()` using formula = 'y ~ x'